In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from warnings import filterwarnings 
filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\car data.csv")
data
Out[2]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
0 ritz 2014 3.35 5.59 27000 Petrol Dealer Manual 0
1 sx4 2013 4.75 9.54 43000 Diesel Dealer Manual 0
2 ciaz 2017 7.25 9.85 6900 Petrol Dealer Manual 0
3 wagon r 2011 2.85 4.15 5200 Petrol Dealer Manual 0
4 swift 2014 4.60 6.87 42450 Diesel Dealer Manual 0
... ... ... ... ... ... ... ... ... ...
296 city 2016 9.50 11.60 33988 Diesel Dealer Manual 0
297 brio 2015 4.00 5.90 60000 Petrol Dealer Manual 0
298 city 2009 3.35 11.00 87934 Petrol Dealer Manual 0
299 city 2017 11.50 12.50 9000 Diesel Dealer Manual 0
300 brio 2016 5.30 5.90 5464 Petrol Dealer Manual 0

301 rows × 9 columns

In [3]:
data.head()
Out[3]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
0 ritz 2014 3.35 5.59 27000 Petrol Dealer Manual 0
1 sx4 2013 4.75 9.54 43000 Diesel Dealer Manual 0
2 ciaz 2017 7.25 9.85 6900 Petrol Dealer Manual 0
3 wagon r 2011 2.85 4.15 5200 Petrol Dealer Manual 0
4 swift 2014 4.60 6.87 42450 Diesel Dealer Manual 0
In [4]:
data.tail()
Out[4]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
296 city 2016 9.50 11.6 33988 Diesel Dealer Manual 0
297 brio 2015 4.00 5.9 60000 Petrol Dealer Manual 0
298 city 2009 3.35 11.0 87934 Petrol Dealer Manual 0
299 city 2017 11.50 12.5 9000 Diesel Dealer Manual 0
300 brio 2016 5.30 5.9 5464 Petrol Dealer Manual 0
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
In [6]:
data.isnull().sum()
Out[6]:
Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64
In [7]:
data.duplicated().sum()
Out[7]:
2
In [8]:
data.drop_duplicates(inplace=True)
In [9]:
data.duplicated().sum()
Out[9]:
0
In [10]:
data.describe()
Out[10]:
Year Selling_Price Present_Price Kms_Driven Owner
count 299.000000 299.000000 299.000000 299.000000 299.000000
mean 2013.615385 4.589632 7.541037 36916.752508 0.043478
std 2.896868 4.984240 8.567887 39015.170352 0.248720
min 2003.000000 0.100000 0.320000 500.000000 0.000000
25% 2012.000000 0.850000 1.200000 15000.000000 0.000000
50% 2014.000000 3.510000 6.100000 32000.000000 0.000000
75% 2016.000000 6.000000 9.840000 48883.500000 0.000000
max 2018.000000 35.000000 92.600000 500000.000000 3.000000
In [11]:
#VISUALIZATION
In [12]:
plt.bar(data['Seller_Type'],data['Present_Price'])
plt.xticks(rotation=90)
plt.show()
In [13]:
fig=px.bar(data,x='Car_Name',y='Present_Price',color='Car_Name')
fig.show()
In [14]:
fig=px.violin(data,x='Car_Name',y='Fuel_Type',color='Car_Name')
fig.show()
In [15]:
fig=px.bar(data,x='Seller_Type',y='Owner',color='Owner')
fig.show()
In [16]:
plt.bar(data['Fuel_Type'],data['Transmission'])
plt.scatter(data['Seller_Type'],data['Fuel_Type'],color='red')
plt.xticks(rotation=90)
plt.show()
In [17]:
plt.figure(figsize=(10,4))
sns.countplot(x='Fuel_Type', data=data, color='b')
plt.title('Car ICE')
plt.show()
In [18]:
plt.figure(figsize=(10,4))
top_car = data['Transmission'].value_counts().nlargest(10)
sns.countplot(y=data.Transmission, order=top_car.index, color='red')
Out[18]:
<AxesSubplot:xlabel='count', ylabel='Transmission'>
In [19]:
sns.lineplot(x='Year', y='Present_Price', data=data).set_title('Variation of present price with year')
Out[19]:
Text(0.5, 1.0, 'Variation of present price with year')
In [20]:
sns.barplot(data['Present_Price'],data['Seller_Type'],color='r')
plt.xticks(rotation=90)
plt.show()
In [21]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='Year', y='Kms_Driven')
plt.title('YEAR vs. KMS_DRIVEN')
plt.xlabel('Year')
plt.ylabel('Kms_Driven')
plt.show()
In [22]:
sns.displot(data["Kms_Driven"])
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x1c1c5da4280>
In [23]:
sns.relplot(x='Year',y='Present_Price',data=data)
Out[23]:
<seaborn.axisgrid.FacetGrid at 0x1c1c5f2cac0>
In [24]:
sns.countplot(x='Fuel_Type',data=data)
plt.xticks(rotation=90)
Out[24]:
(array([0, 1, 2]),
 [Text(0, 0, 'Petrol'), Text(1, 0, 'Diesel'), Text(2, 0, 'CNG')])
In [25]:
sns.boxplot(x='Selling_Price',y='Fuel_Type',data=data)
Out[25]:
<AxesSubplot:xlabel='Selling_Price', ylabel='Fuel_Type'>
In [26]:
sns.violinplot(x='Owner',y='Kms_Driven',data=data)
Out[26]:
<AxesSubplot:xlabel='Owner', ylabel='Kms_Driven'>
In [27]:
#MODEL BUILDING
In [28]:
data['Age']=2024-data['Year']
data.drop('Year',axis=1,inplace=True)
In [29]:
data.head()
Out[29]:
Car_Name Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner Age
0 ritz 3.35 5.59 27000 Petrol Dealer Manual 0 10
1 sx4 4.75 9.54 43000 Diesel Dealer Manual 0 11
2 ciaz 7.25 9.85 6900 Petrol Dealer Manual 0 7
3 wagon r 2.85 4.15 5200 Petrol Dealer Manual 0 13
4 swift 4.60 6.87 42450 Diesel Dealer Manual 0 10
In [30]:
data.rename(columns={'Selling_Price':'Selling_Price(lacs)','Present_Price':'Present_Price(lacs)','Owner':'past_owner'},inplace=True)
In [31]:
data.columns
Out[31]:
Index(['Car_Name', 'Selling_Price(lacs)', 'Present_Price(lacs)', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'past_owner', 'Age'],
      dtype='object')
In [32]:
cat_cols=['Fuel_Type','Seller_Type','Transmission','past_owner']
i=0
while i<4:
    fig=plt.figure(figsize=[10,4])
    plt.subplot(1,2,1)
    sns.countplot(x=cat_cols[i],data=data)
    i+=1
    plt.subplot(1,2,2)
    sns.countplot(x=cat_cols[i],data=data)
    i+=1
    plt.show()
In [33]:
num_cols=['Selling_Price(lacs)','Present_Price(lacs)','Kms_Driven','Age']
i=0
while i<4:
    fig=plt.figure(figsize=[13,3])
    plt.subplot(1,2,1)
    sns.boxplot(x=num_cols[i],data=data)
    
    i += 1
    
    plt.subplot(1,2,2)
    sns.boxplot(x=num_cols[i],data=data)
    
    i += 1
    
    plt.show()
In [34]:
data.drop(labels = 'Car_Name', axis = 1, inplace = True)
In [35]:
data = pd.get_dummies(data = data,drop_first = True)
In [36]:
sns.heatmap(data.corr(), annot=True, cmap="RdBu")
plt.show()
In [37]:
data.corr()['Selling_Price(lacs)']
Out[37]:
Selling_Price(lacs)       1.000000
Present_Price(lacs)       0.876378
Kms_Driven                0.028566
past_owner               -0.087880
Age                      -0.234369
Fuel_Type_Diesel          0.543541
Fuel_Type_Petrol         -0.531636
Seller_Type_Individual   -0.553851
Transmission_Manual      -0.348869
Name: Selling_Price(lacs), dtype: float64
In [38]:
x=data.drop('Selling_Price(lacs)',axis=1)
y=data['Selling_Price(lacs)']
In [39]:
x.head()
Out[39]:
Present_Price(lacs) Kms_Driven past_owner Age Fuel_Type_Diesel Fuel_Type_Petrol Seller_Type_Individual Transmission_Manual
0 5.59 27000 0 10 0 1 0 1
1 9.54 43000 0 11 1 0 0 1
2 9.85 6900 0 7 0 1 0 1
3 4.15 5200 0 13 0 1 0 1
4 6.87 42450 0 10 1 0 0 1
In [40]:
y.head()
Out[40]:
0    3.35
1    4.75
2    7.25
3    2.85
4    4.60
Name: Selling_Price(lacs), dtype: float64
In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2,random_state=1)
print("x train: ",x_train.shape)
print("x test: ",x_test.shape)
print("y train: ",y_train.shape)
print("y test: ",y_test.shape)
x train:  (239, 8)
x test:  (60, 8)
y train:  (239,)
y test:  (60,)
In [42]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
In [43]:
print("coefficients:",model.coef_)
print("intercept:", model.intercept_)
coefficients: [ 5.29181181e-01 -5.99046152e-06 -9.44252885e-01 -4.18029869e-01
  2.12111606e+00  4.98109745e-01 -4.73063346e-01 -5.27393308e-01]
intercept: 5.087110830354567
In [44]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test,y_pred)
print("mean squared error:",mse)
mean squared error: 5.132840514762981
In [45]:
error=y_test,-y_pred
print(error)
(175     0.38
289    10.11
52     18.00
148     0.52
216     2.90
199     0.12
276     8.65
63     23.50
28      1.95
203     2.95
114     1.15
121     1.05
11      6.85
246     3.75
112     1.15
297     4.00
74      4.90
107     1.25
267     8.35
109     1.20
190     0.20
12      7.50
71      4.50
19      2.65
89      4.75
110     1.20
300     5.30
178     0.35
122     1.05
252     5.40
86     35.00
298     3.35
134     0.65
287     5.75
133     0.72
79     14.50
255     3.00
140     0.60
91     11.25
191     0.20
4       4.60
195     0.18
156     0.48
249     5.25
81      4.75
16      7.25
170     0.40
129     0.78
59     19.99
187     0.25
281     2.10
213     2.90
176     0.35
60      6.95
152     0.50
214     5.25
193     0.20
285     7.40
174     0.38
141     0.60
Name: Selling_Price(lacs), dtype: float64, array([ 8.82443400e-01, -8.84467711e+00, -1.46540728e+01,  1.03979470e+00,
       -3.98298018e+00,  2.53231323e+00, -8.34385906e+00, -2.21937617e+01,
       -1.02884763e+00, -1.73742233e+00, -1.49855359e+00,  9.71867882e-02,
       -8.10595189e+00, -3.42494291e+00, -1.63256684e+00, -4.05829972e+00,
       -6.72891454e+00, -6.90350211e-01, -8.19447911e+00, -2.37055610e+00,
        2.06625579e+00, -6.99315577e+00, -7.58105566e+00, -3.18001822e+00,
       -4.21517490e+00, -2.14652470e+00, -4.80302540e+00, -1.09321399e+00,
       -1.82372418e+00, -5.24370518e+00, -4.93633368e+01, -4.08160698e+00,
       -2.01650415e+00, -5.52823031e+00, -1.74025186e+00, -1.78569533e+01,
       -2.55105014e+00, -4.29758011e-01, -1.06661636e+01,  1.22397566e+00,
       -5.88171452e+00, -7.82166925e-01, -1.84387213e+00, -5.63352745e+00,
       -6.83301858e+00, -8.37875925e+00,  2.53216644e-02, -2.07311472e+00,
       -2.18116746e+01, -6.46229964e-02, -1.25281187e+00, -3.02725664e+00,
        4.73697981e-01, -1.00679360e+01, -1.44188632e-02, -5.02745877e+00,
        7.57620831e-01, -6.64881130e+00, -9.72273736e-01, -1.54406068e+00]))
In [46]:
error=y_test-y_pred
data=pd.DataFrame({'y_pred':y_pred, 'error': error})
sns.regplot(x='y_pred', y='error', data=data)
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]: